This notebook pulls in data on Nipah RBP entry into CHO-EFNB2 and CHO-EFNB3 cells, filters data, calculates stats, and makes figures¶
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
e2_distances_file = None
func_scores_E2_file = None
func_scores_E3_file = None
filtered_E2_data = None
filtered_E3_data = None
E2_entry_heatmap = None
E3_entry_heatmap = None
E2_E3_entry_corr_plot = None
entry_boxplot_E2_plot = None
entry_boxplot_E3_plot = None
nipah_config = None
altair_config = None
entropy_file = None
In [2]:
# Parameters
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
e2_distances_file = "results/distances/2vsm_distances.csv"
filtered_E2_data = "results/filtered_data/E2_entry_filtered.csv"
filtered_E3_data = "results/filtered_data/E3_entry_filtered.csv"
E2_entry_heatmap = "results/images/E2_entry_heatmap.html"
E3_entry_heatmap = "results/images/E3_entry_heatmap.html"
E2_E3_entry_corr_plot = "results/images/E2_E3_entry_corr_plot.html"
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entropy_file = "results/entropy/entropy.csv"
entry_boxplot_E2_plot = "results/images/E2_entry_boxplot.html"
entry_boxplot_E3_plot = "results/images/E3_entry_boxplot.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import Bio.SeqIO
import yaml
from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
Setup in correct directory
In [5]:
#hard paths in case don't want to run with snakemake
#e2_distances_file = "results/distances/2vsm_distances.csv"
#func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
#func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
#
#filtered_E2_data = "results/filtered_data/E2_entry_filtered.csv"
#filtered_E3_data = "results/filtered_data/E3_entry_filtered.csv"
#
#E2_entry_heatmap = "results/images/E2_entry_heatmap.html"
#E3_entry_heatmap = "results/images/E3_entry_heatmap.html"
#E2_E3_entry_corr_plot = "results/images/E2_E3_entry_corr.html"
#
#nipah_config = "nipah_config.yaml"
#altair_config = "data/custom_analyses_data/theme.py"
#
#entropy_file = "results/entropy/entropy.csv"
Read in custom altair theme and Import YAML file with parameters¶
In [6]:
if altair_config:
with open(altair_config, 'r') as file:
exec(file.read())
with open(nipah_config) as f:
config = yaml.safe_load(f)
Filter and merge EFNB2 and EFNB3 entry¶
In [7]:
# Import data
func_scores_E2 = pd.read_csv(func_scores_E2_file)
func_scores_E3 = pd.read_csv(func_scores_E3_file)
#func_scores_E2 = pd.read_csv('results/func_effects/averages/CHO_EFNB2_low_func_effects.csv')
#func_scores_E3 = pd.read_csv('results/func_effects/averages/CHO_EFNB3_low_func_effects.csv')
def num_selections_and_filter(df,name):
df = df[df['mutant'] != '-']
#Make a couple plots before filtering
def plot_mean_cell_entry_mutant(df):
mean_df = df.groupby('mutant')['effect'].median().reset_index()
chart = alt.Chart(mean_df).mark_circle(size=100,color='black',opacity=1).encode(
x=alt.X('mutant:O',axis=alt.Axis(grid=True),sort=alt.EncodingSortField(field='effect', order='descending'),title='Mutant Amino Acid'),
y=alt.Y('effect',title=f'Median Cell Entry for {name}',axis=alt.Axis(grid=True,tickCount=4)),
).properties(
height=alt.Step(10),
width=alt.Step(10)
)
return chart.display()
plot_mean_cell_entry_mutant(df)
def plot_times_seen_std(df):
chart = alt.Chart(df).mark_circle(size=20,color='black',opacity=0.2).encode(
x=alt.X('times_seen',axis=alt.Axis(grid=False),title=f'Times Seen for {name}',scale=alt.Scale(type='log')),
y=alt.Y('effect_std',title='Effect Std',axis=alt.Axis(grid=True,tickCount=4)),
tooltip=['site','times_seen','effect_std','effect']
).properties(
height=alt.Step(10),
width=alt.Step(10)
)
return chart.display()
plot_times_seen_std(df)
max_sels = df['n_selections'].max()
num_sels_cutoff = (max_sels/2) + 1
print(f'The number of selections a mutant must be observed in is: {num_sels_cutoff}')
total_mut = (602-71) * 20
filter_test = df[
(df['site'] != 603) &
(df['mutant'] != '-') &
(df['mutant'] != '*')
]
num_variants_pre_filter = filter_test.shape[0]
print(f'Before filtering there are {num_variants_pre_filter} mutants which is {(num_variants_pre_filter/total_mut) * 100:.1f}%')
filter_test_times_seen = filter_test[filter_test['times_seen'] >= config['func_times_seen_cutoff']]
num_variants_times_seen = filter_test_times_seen.shape[0]
print(f"After filtering for {config['func_times_seen_cutoff']} times seen, there are {num_variants_times_seen}, which is {(num_variants_times_seen/total_mut) * 100:.1f}%")
filter_test_effect_std = filter_test_times_seen[filter_test_times_seen['effect_std'] <= config['func_std_cutoff']]
num_variants_std = filter_test_effect_std.shape[0]
print(f"After filtering for {config['func_std_cutoff']} std cutoff, there are {num_variants_std}, which is {(num_variants_std/total_mut) * 100 :.1f}%")
filter_test_n_selections = filter_test_effect_std[filter_test_effect_std['n_selections'] >= num_sels_cutoff]
num_variants_n_selections = filter_test_n_selections.shape[0]
print(f'After filtering for mutants in in all selections, there are {num_variants_n_selections}, which is {(num_variants_n_selections/total_mut) * 100 :.1f}%')
for times_seen in [2.5,3,3.5,4]:
filter_test = filter_test[filter_test['times_seen'] >= times_seen]
print(f'For {times_seen} times seen cutoff, there are {filter_test.shape[0]} mutations, which is {(filter_test.shape[0]/total_mut) * 100 :.1f}%')
max_sels = df['n_selections'].max()
num_sels_cutoff = (max_sels/2) + 1
df = df[
(df['site'] != 603) &
(df['mutant'] != '-') &
(df['mutant'] != '*') &
(df['times_seen'] >= config['func_times_seen_cutoff']) &
(df['effect_std'] <= config['func_std_cutoff']) &
(df['n_selections'] >= num_sels_cutoff)
]
plot_mean_cell_entry_mutant(df)
#Now write filtered_data for mapping
if name == 'E2':
df.to_csv(filtered_E2_data,index=False)
if name == 'E3':
df.to_csv(filtered_E3_data,index=False)
return df
func_scores_E2 = num_selections_and_filter(func_scores_E2,'E2')
func_scores_E3 = num_selections_and_filter(func_scores_E3,'E3')
merged_df = pd.merge(
func_scores_E2,
func_scores_E3,
on=['site','mutant','wildtype'],
how='outer',
suffixes=['_E2','_E3']
)
display(merged_df.describe())
The number of selections a mutant must be observed in is: 5.0 Before filtering there are 10607 mutants which is 99.9% After filtering for 2 times seen, there are 9811, which is 92.4% After filtering for 1 std cutoff, there are 9729, which is 91.6% After filtering for mutants in in all selections, there are 9725, which is 91.6% For 2.5 times seen cutoff, there are 9545 mutations, which is 89.9% For 3 times seen cutoff, there are 9259 mutations, which is 87.2% For 3.5 times seen cutoff, there are 8806 mutations, which is 82.9% For 4 times seen cutoff, there are 8315 mutations, which is 78.3%
The number of selections a mutant must be observed in is: 4.5 Before filtering there are 10607 mutants which is 99.9% After filtering for 2 times seen, there are 9852, which is 92.8% After filtering for 1 std cutoff, there are 9714, which is 91.5% After filtering for mutants in in all selections, there are 9586, which is 90.3% For 2.5 times seen cutoff, there are 9535 mutations, which is 89.8% For 3 times seen cutoff, there are 9168 mutations, which is 86.3% For 3.5 times seen cutoff, there are 8411 mutations, which is 79.2% For 4 times seen cutoff, there are 7775 mutations, which is 73.2%
| site | effect_E2 | effect_std_E2 | times_seen_E2 | n_selections_E2 | effect_E3 | effect_std_E3 | times_seen_E3 | n_selections_E3 | |
|---|---|---|---|---|---|---|---|---|---|
| count | 9885.000000 | 9725.000000 | 9725.000000 | 9725.000000 | 9725.000000 | 9586.000000 | 9586.000000 | 9586.000000 | 9586.000000 |
| mean | 337.222964 | -0.918767 | 0.322007 | 7.196755 | 7.987249 | -0.952189 | 0.344646 | 6.142469 | 6.984769 |
| std | 153.228045 | 1.402207 | 0.235383 | 4.076880 | 0.115810 | 1.458882 | 0.246157 | 3.272205 | 0.139967 |
| min | 71.000000 | -3.518000 | 0.000000 | 2.000000 | 5.000000 | -3.608000 | 0.000000 | 2.000000 | 5.000000 |
| 25% | 205.000000 | -2.214000 | 0.147400 | 4.625000 | 8.000000 | -2.501250 | 0.173000 | 4.143000 | 7.000000 |
| 50% | 338.000000 | -0.266600 | 0.301400 | 6.375000 | 8.000000 | -0.222150 | 0.328000 | 5.429000 | 7.000000 |
| 75% | 470.000000 | 0.219000 | 0.466300 | 8.500000 | 8.000000 | 0.223875 | 0.494650 | 7.143000 | 7.000000 |
| max | 602.000000 | 0.640400 | 1.000000 | 64.380000 | 8.000000 | 0.660100 | 1.000000 | 49.140000 | 7.000000 |
Stats¶
In [8]:
def calculate_stats(df,name):
#df = df[df['times_seen'] > cutoff]
total_mut = (602-71) * 20
print(total_mut)
muts_present = df['effect'].shape[0]
fraction_muts = muts_present / total_mut
print(f'fraction muts present in {name} is {fraction_muts:.2f} {muts_present}/{total_mut}')
deleterious_muts = df[df['effect'] <= -0.25].shape[0]
neutral_muts = df[(df['effect'] > -0.25) & (df['effect'] < 0.25)].shape[0]
positive_muts = df[df['effect'] > 0.25].shape[0]
frac_bad_muts = deleterious_muts / muts_present
frac_neutral_muts = neutral_muts / muts_present
frac_pos_muts = positive_muts / muts_present
print(f'The number of deleterious mutants for {name} is {frac_bad_muts:.2f} {deleterious_muts}/{muts_present}')
print(f'The number of neutral mutants for {name} is {frac_neutral_muts:.2f} {neutral_muts}/{muts_present}')
print(f'The number of positive mutants for {name} is {frac_pos_muts:.2f} {positive_muts}/{muts_present}')
calculate_stats(func_scores_E2,'E2')
calculate_stats(func_scores_E3,'E3')
10620 fraction muts present in E2 is 0.92 9725/10620 The number of deleterious mutants for E2 is 0.50 4906/9725 The number of neutral mutants for E2 is 0.26 2576/9725 The number of positive mutants for E2 is 0.23 2243/9725 10620 fraction muts present in E3 is 0.90 9586/10620 The number of deleterious mutants for E3 is 0.49 4719/9586 The number of neutral mutants for E3 is 0.28 2687/9586 The number of positive mutants for E3 is 0.23 2180/9586
In [9]:
def effect_histogram(df):
base = alt.Chart(df).mark_bar(opacity=0.5).encode(
alt.X(alt.repeat("column"), type='quantitative', bin=alt.Bin(maxbins=10)),
alt.Y('count()', stack=None),
color=alt.Color('Experiment:N')
).properties(
width=300,
height=alt.Step(10)
)
# Create the layered histogram
histogram = alt.layer(
base.encode(alt.X('effect_E2:Q', bin=alt.Bin(maxbins=100),axis=alt.Axis(values=[1, 0, -1, -2, -3, -4]),scale=alt.Scale(domain=[-4,1])), color=alt.value('#1f4e79')),
base.encode(alt.X('effect_E3:Q', bin=alt.Bin(maxbins=100),axis=alt.Axis(values=[1, 0, -1, -2, -3, -4]),scale=alt.Scale(domain=[-4,1])), color=alt.value('#ff7f0e'))
)
return histogram.display()
effect_histogram(merged_df)
In [10]:
def overall_stats_median(df,effect):
mean_df = df.groupby('site')[[effect]].max().reset_index()
total_sites = len(mean_df['site'].unique().tolist())
mean_df = mean_df[mean_df[effect] < 0]
subset = len(mean_df['site'].unique().tolist())
fraction = subset/total_sites
percent = fraction * 100
print(f'The number of sites with median entry score less than 0 for {effect} is : {subset}')
print(f'The percent of sites with median entry score less than 0 for {effect} is : {percent:.0f}')
overall_stats_median(func_scores_E2,'effect')
overall_stats_median(func_scores_E3,'effect')
The number of sites with median entry score less than 0 for effect is : 70 The percent of sites with median entry score less than 0 for effect is : 13 The number of sites with median entry score less than 0 for effect is : 78 The percent of sites with median entry score less than 0 for effect is : 15
How many sites and which sites only have negative entry scores for mutations?¶
In [11]:
def overall_stats_all_neg(df,effect):
filtered_df = df.groupby('site').filter(lambda group: (group[effect] < 0).all())
unique = filtered_df['site'].unique()
print(list(unique))
total_sites = df['site'].unique().shape[0]
subset = filtered_df['site'].unique().shape[0]
fraction = subset/total_sites
percent = fraction * 100
print(f'The total number of sites are: {total_sites}')
print(f' The number of sites where all mutants are negative for {effect}: {subset}')
print(f' The percent of sites where all mutants are negative for {effect}: {percent:.0f}')
return unique
intolerant_sites_E2 = list(overall_stats_all_neg(func_scores_E2,'effect'))
intolerant_sites_E3 = list(overall_stats_all_neg(func_scores_E3,'effect'))
[80, 106, 107, 108, 111, 112, 113, 120, 121, 125, 126, 127, 130, 138, 146, 151, 157, 158, 159, 162, 163, 165, 167, 172, 189, 203, 205, 206, 207, 208, 216, 229, 240, 246, 251, 253, 254, 257, 258, 259, 260, 262, 263, 264, 266, 267, 303, 323, 331, 347, 355, 382, 387, 395, 412, 460, 467, 487, 489, 493, 499, 500, 503, 506, 537, 563, 565, 574, 594, 598] The total number of sites are: 532 The number of sites where all mutants are negative for effect: 70 The percent of sites where all mutants are negative for effect: 13 [95, 100, 106, 107, 108, 111, 112, 113, 121, 125, 126, 138, 146, 158, 162, 163, 201, 203, 206, 207, 216, 229, 240, 243, 248, 251, 253, 257, 258, 259, 260, 263, 266, 303, 347, 352, 355, 368, 382, 387, 389, 395, 412, 439, 458, 460, 467, 486, 487, 489, 493, 494, 497, 499, 500, 501, 503, 504, 505, 506, 510, 526, 531, 532, 533, 537, 556, 557, 563, 565, 573, 574, 579, 581, 584, 585, 588, 594] The total number of sites are: 532 The number of sites where all mutants are negative for effect: 78 The percent of sites where all mutants are negative for effect: 15
In [12]:
def calculate_top_func_scores(df,effect):
percentile_95_effect_E2 = df[effect].quantile(0.999)
cutoff_E2_df_sites = df[df[effect] > percentile_95_effect_E2]
E2_site_cutoff = cutoff_E2_df_sites['site'].unique()
print(f'The sites with the highest functional scores for {effect} are: {list(E2_site_cutoff)}')
calculate_top_func_scores(func_scores_E2,'effect')
calculate_top_func_scores(func_scores_E3,'effect')
The sites with the highest functional scores for effect are: [280, 305, 407, 463, 468, 501, 552, 556, 584, 599] The sites with the highest functional scores for effect are: [183, 315, 337, 358, 378, 393, 418, 419, 554, 597]
Make bubble plots of receptor contact site type (median values per site)¶
In [13]:
def make_boxplot_entry_region(df,effect):# Create a box plot using Altair for aggregated means
barrel_ranges = {
'Hydrophobic': config['hydrophobic'],
'Salt Bridges': config['salt_bridges'],
'Hydrogen Bonds': config['h_bond_total'],
'Contact': config['contact_sites'],
'Overall': list(range(71,602)),
}
mean_df = df.groupby('site')[['effect']].median().reset_index()
custom_order = ['Hydrophobic','Salt Bridges','Hydrogen Bonds','Contact','Overall']
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df['site'].isin(sites)]
for _, row in subset.iterrows():
agg_means.append({'barrel': barrel, 'effect': row['effect'],'site':row['site']})
agg_means_df = pd.DataFrame(agg_means)
if effect == 'E2':
effect_name = 'EFNB2'
else:
effect_name = 'EFNB3'
chart = alt.Chart(agg_means_df).mark_point(color='black',filled=True,size=50,opacity=0.7).encode(
x=alt.X('barrel:O',sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
y=alt.Y('effect',title=f'Median Entry for {effect_name}',axis=alt.Axis(grid=True,tickCount=4)),
xOffset='random:Q',
#color = alt.Color('barrel').legend(None),
tooltip=['barrel', 'effect','site'],
).transform_calculate(
random="sqrt(-1*log(random()))*cos(2*PI*random())"
).properties(
height=200,
width=alt.Step(20)
)
return chart.display()
make_boxplot_entry_region(func_scores_E2,'E2')
make_boxplot_entry_region(func_scores_E3,'E3')
Make bubble plots depending on amino acid property¶
In [14]:
def make_boxplot_entry_region(df,effect):
barrel_ranges = {
'hydrophobic_AA': list(['A','V','L','I','M']),
'aromatic_AA': list(['Y','W','F']),
'positive_AA': list(['K','R','H']),
'negative_AA': list(['E','D']),
'hydrophilic_AA': list(['S','T','N','Q']),
'special_AA': list(['C','P','G']),
}
mean_df = df.groupby(['site','wildtype'])['effect'].median().reset_index()
if effect == 'E2':
effect_name = 'EFNB2'
else:
effect_name = 'EFNB3'
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df['wildtype'].isin(sites)]
for _, row in subset.iterrows():
agg_means.append({'barrel': barrel, 'effect': row['effect'],'site':row['site']})
agg_means_df = pd.DataFrame(agg_means)
chart = alt.Chart(agg_means_df).mark_point(filled=True).encode(
x=alt.X('barrel:O',title=None,axis=alt.Axis(labelAngle=-90)), #sort=custom_order
y=alt.Y('effect',title=f'Median Entry for {effect_name}',axis=alt.Axis(grid=True,tickCount=4)),
xOffset='random:Q',
color = alt.Color('barrel').legend(None),
tooltip=['barrel', 'effect','site'],
).transform_calculate(
random="sqrt(-1*log(random()))*cos(2*PI*random())"
).properties(
height=alt.Step(50),
width=alt.Step(50)
)
return chart.display()
make_boxplot_entry_region(func_scores_E2,'E2')
make_boxplot_entry_region(func_scores_E3,'E3')
Plot correlations between E2 and E3 entry¶
In [15]:
e2_distances = pd.read_csv(e2_distances_file)
distance_df = pd.merge(merged_df,e2_distances[['site','distance']],on='site',how='left')
def determine_distance(df):
# Define the conditions
conditions = [
df['distance'] < 4,
(df['distance'] >= 4) & (df['distance'] <= 8),
df['distance'] > 10
]
# Define the associated values for the conditions
choices = ['contact', 'close', 'distant']
# Apply the conditions and choices to the 'E2_contact' column
df['contact'] = np.select(conditions, choices, default='distant')
return df
distance_df = determine_distance(distance_df)
def median_correlation_plot(df,metric):
aggregation = getattr(df.groupby('site')[['effect_E2', 'effect_E3']], metric)
means = aggregation().reset_index()
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(means['effect_E2'], means['effect_E3'])
#display(r_value)
means = means.rename(columns={"effect_E2": f"{metric}_E2", "effect_E3": f"{metric}_E3"})
contact_sites = df[['site', 'contact','wildtype']].drop_duplicates()
df_mean = pd.merge(means, contact_sites, on='site', how='left')
chart = alt.Chart(df_mean).mark_point(size=40, opacity=0.5,filled=True).encode(
x=alt.X(f"{metric}_E2", title=f"Cell Entry of RBP Mutants with EFNB2 ({metric})"),
y=alt.Y(f"{metric}_E3", title=f"Cell Entry of RBP Mutants with EFNB3 ({metric})"),
tooltip=['site', 'wildtype'],
color=alt.Color('contact', scale=alt.Scale(domain=['contact','close','distant'],range=['#1f4e79','#ff7f0e','gray']), legend=alt.Legend(title='RBP Distance to Receptor'))
).properties(
width=300,
height=300
)
min_ = int(df_mean[f'{metric}_E2'].min())
max_ = int(df_mean[f'{metric}_E3'].max())
text = alt.Chart({'values':[{'x': min_, 'y': max_, 'text': f'r = {r_value:.2f}'}]}).mark_text(
align='left',
baseline='top',
dx=0, # Adjust this for position
dy=0 # Adjust this for position
).encode(
x=alt.X('x:Q'),
y=alt.Y('y:Q'),
text='text:N'
)
plot = chart+text
return plot
E2_E3_plot = median_correlation_plot(distance_df,'sum')
E2_E3_plot.display()
E2_E3_plot.save(E2_E3_entry_corr_plot)
#median_correlation_plot(distance_df,'median')
Make boxplot showing median entry by RBP region¶
In [16]:
def make_boxplot_entry_region(df,effect):
barrel_ranges = {
'Stalk': list(range(96, 147)),
'Neck': list(range(148,165)),
'Linker': list(range(166,177)),
'Head': list(range(178,602)),
'Total': list(range(71,602)),
}
custom_order=['Stalk','Neck','Linker','Head','Total']
mean_df = df.groupby('site')['effect'].median().reset_index()
if effect == 'E2':
effect_name = 'EFNB2'
else:
effect_name = 'EFNB3'
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df['site'].isin(sites)]
for _, row in subset.iterrows():
agg_means.append({'barrel': barrel, 'effect': row['effect'],'site':row['site']})
agg_means_df = pd.DataFrame(agg_means)
chart = alt.Chart(agg_means_df).mark_boxplot(extent='min-max',opacity=1).encode(
x=alt.X('barrel:O',sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
y=alt.Y('effect',title=f'Median Entry for {effect_name}',axis=alt.Axis(grid=True,tickCount=4)),
color = alt.Color('barrel').legend(None),
tooltip=['barrel', 'effect','site'],
).properties(
height=alt.Step(50),
width=alt.Step(50)
)
return chart
entry_boxplot_E2 = make_boxplot_entry_region(func_scores_E2,'E2')
entry_boxplot_E2.display()
entry_boxplot_E2.save(entry_boxplot_E2_plot)
entry_boxplot_E3 = make_boxplot_entry_region(func_scores_E3,'E3')
entry_boxplot_E3.display()
entry_boxplot_E3.save(entry_boxplot_E3_plot)
Make boxplot of functional domains¶
In [17]:
def make_boxplot_func_domains(df,effect):
if effect == 'effect_E2':
effect_name = 'EFNB2'
else:
effect_name = 'EFNB3'
barrel_ranges = {
'Receptor Contact': config['contact_sites'],
'Dimerization' : [160, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 202, 203, 204, 205, 206, 208, 210, 212, 237, 238, 239, 240, 242, 247, 249, 256, 258, 259, 260, 261, 266, 267, 268, 270, 323, 324, 325, 331, 584, 589, 591],
'Overall': list(range(71,602)),
}
custom_order=['Receptor Contact','Dimerization','Overall']
mean_df = df.groupby('site')[[effect]].median().reset_index()
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df['site'].isin(sites)]
for _, row in subset.iterrows():
agg_means.append({'barrel': barrel, 'effect': row[effect],'site':row['site']})
agg_means_df = pd.DataFrame(agg_means)
# Create a box plot using Altair for aggregated means
chart = alt.Chart(agg_means_df).mark_boxplot(extent='min-max',opacity=1).encode(
x=alt.X('barrel:O',sort=custom_order,title='Region',axis=alt.Axis(labelAngle=-90)),
y=alt.Y('effect',title=f'Median Cell Entry for {effect_name}', axis=alt.Axis(grid=True,tickCount=4)),
color = alt.Color('barrel').legend(None),
tooltip=['barrel', 'effect','site'],
#xOffset='random:Q',
).properties(
height=alt.Step(20),
width=alt.Step(20)
)
return chart.display()
make_boxplot_func_domains(merged_df,'effect_E2')
make_boxplot_func_domains(merged_df,'effect_E3')
Generate Full Heatmap For EFNB2 and EFNB3¶
First, prep the data for heatmaps:
Make dataframe from entropy for heatmap
In [18]:
entropy_df = pd.read_csv(entropy_file)
entropy_df = entropy_df[['site','henipavirus_entropy']]
entropy_df = entropy_df.dropna(subset=['site'])
entropy_df['site'] = entropy_df['site'].astype('Int64')
entropy_df = entropy_df.rename(columns={'henipavirus_entropy':'entropy'})
entropy_df = entropy_df[['site','entropy']].drop_duplicates()
entropy_df['mutant'] = 'entropy'
entropy_df['wildtype'] = ''
entropy_df['effect'] = 'effect'
entropy_df.rename(columns={'entropy': 'value'}, inplace=True)
display(entropy_df.head(3))
entropy_sites = entropy_df[entropy_df['value'] == 0]
entropy_sites = list(entropy_sites['site'].unique())
print(entropy_sites) #These are sites conserved across all henipaviruses, used later
| site | value | mutant | wildtype | effect | |
|---|---|---|---|---|---|
| 0 | 1 | -0.00 | entropy | effect | |
| 1 | 2 | 1.75 | entropy | effect | |
| 2 | 3 | 1.75 | entropy | effect |
[1, 17, 23, 38, 63, 97, 101, 104, 105, 107, 114, 119, 124, 146, 148, 151, 153, 158, 160, 162, 165, 181, 189, 203, 216, 220, 222, 229, 231, 233, 235, 240, 253, 254, 263, 282, 295, 324, 345, 355, 365, 382, 393, 395, 398, 439, 440, 454, 455, 457, 459, 460, 462, 479, 485, 486, 487, 488, 493, 494, 499, 500, 503, 506, 508, 509, 510, 524, 526, 528, 530, 532, 533, 534, 535, 540, 546, 561, 565, 566, 573, 574, 575, 579, 588, 589, 590, 598, 601]
Make dataframe for contact sites for heatmap
In [19]:
def make_contact():
df = pd.DataFrame({
'site': config['contact_sites'],
'contact': [0.0] * len(config['contact_sites'])
})
df = df[['site','contact']]
df['mutant'] = 'contact'
df['wildtype'] = ''
df['effect'] = 'effect'
df.rename(columns={'contact':'value'}, inplace=True)
return df
contact_df = make_contact()
display(contact_df.head(3))
| site | value | mutant | wildtype | effect | |
|---|---|---|---|---|---|
| 0 | 239 | 0.0 | contact | effect | |
| 1 | 240 | 0.0 | contact | effect | |
| 2 | 241 | 0.0 | contact | effect |
Make an empty data frame for all sites and mutations, then merge with func_scores so heatmap will have gray sites where no mutations were recorded. Finally, merge in entropy and contact for plotting
In [20]:
def make_empty_df(df):
sites = range(71, 603)
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
# Create the combination of each site with each amino acid
data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
# Create the DataFrame
empty_df = pd.DataFrame(data)
all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
df_test = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['effect'],
var_name='effect', value_name='value')
df_test = pd.concat([df_test,entropy_df ,contact_df],ignore_index=True) #entropy_df ,contact_df
return df_test
empty_df_E2 = make_empty_df(func_scores_E2)
empty_df_E3 = make_empty_df(func_scores_E3)
display(empty_df_E2)
| site | mutant | wildtype | effect | value | |
|---|---|---|---|---|---|
| 0 | 71 | A | NaN | effect | NaN |
| 1 | 71 | C | Q | effect | -1.843 |
| 2 | 71 | D | Q | effect | -1.312 |
| 3 | 71 | E | Q | effect | -1.137 |
| 4 | 71 | F | Q | effect | -1.187 |
| ... | ... | ... | ... | ... | ... |
| 11270 | 579 | contact | effect | 0.000 | |
| 11271 | 580 | contact | effect | 0.000 | |
| 11272 | 581 | contact | effect | 0.000 | |
| 11273 | 583 | contact | effect | 0.000 | |
| 11274 | 588 | contact | effect | 0.000 |
11275 rows × 5 columns
Make heatmap now with dataframe produced above. I specify range to split the heatmap into four equal rows.
In [21]:
def plot_heatmap_full_update_test(df,legend_title):
# Create y-axis order list
custom_order = ['contact','entropy','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
#Sites for wrapping heatmap correctly
full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
# Prepare the color scales separately for entropy and effects
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2.5])
color_scale_entropy = alt.Scale(scheme='purples', domain=[0, 2],reverse=True)
color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
# set strokewidth to control stroke size around each individual box
strokewidth_size = 0.25
# container to hold the charts
charts = []
# Flags for showing the legend only the first time
effect_legend_added = False
entropy_legend_added = False
for subset in full_ranges:
subset_df = final_df[final_df['site'].isin(subset)] #for the wrapping of sites
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title='Site', sort=sites, axis=alt.Axis(labelExpr="datum.value % 10 === 0 ? datum.value : ''",labelAngle=-90,grid=False)),
y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
tooltip=['site','wildtype','mutant','value'],
).properties(
width=alt.Step(10),
height=alt.Step(11) #Have to make slightly taller to fit everything
)
)
# Individual Heatmaps
chart_empty = (
base.mark_rect(color='#d1d3d4').encode().transform_filter( ##bcbec0
(alt.datum.effect == 'effect')
)
)
if not effect_legend_added:
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)),
alt.value('transparent')),
).transform_filter(
(alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
effect_legend_added = True
else:
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
if not entropy_legend_added:
chart_entropy = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant == "entropy"',
alt.Color('value:Q', scale=color_scale_entropy,legend=alt.Legend(title='Henipavirus Entropy')),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
entropy_legend_added = True
else:
chart_entropy = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant == "entropy"',
alt.Color('value:Q', scale=color_scale_entropy,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
chart_contact = (
base.mark_rect(color='black').encode(
#color=alt.condition(f'datum.mutant == "contact"',
#alt.Color('value:Q', scale=color_scale_contact,legend=None),
#alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'contact') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype'],
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
#display(unique_wildtypes_df)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8,align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype'],
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.mutant != 'entropy') & (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect, chart_entropy, chart_contact, wildtype_layer_box, wildtype_layer).resolve_scale(color='independent')
charts.append(chart)
combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='shared')
return combined_chart#.display()
E2_entry_heatmap_full = plot_heatmap_full_update_test(empty_df_E2,'CHO-EFNB2 Entry')
E2_entry_heatmap_full.save(E2_entry_heatmap)
E2_entry_heatmap_full.display()
E3_entry_heatmap_full = plot_heatmap_full_update_test(empty_df_E3,'CHO-EFNB3 Entry')
E3_entry_heatmap_full.save(E3_entry_heatmap)
E3_entry_heatmap_full.display()
Find sites that have unique entry scores by amino acid property and show with heatmap¶
In [22]:
def check_muts_by_properties(df,min_group_size,positive_value,negative_value):
letter_to_type = {'D': 'negative',
'E': 'negative',
'K': 'positive',
'R': 'positive',
'H': 'positive',
'C': 'special',
'S': 'hydrophilic',
'T': 'hydrophilic',
'N': 'hydrophilic',
'Q': 'hydrophilic',
'G': 'special',
'A': 'hydrophobic',
'V': 'hydrophobic',
'L': 'hydrophobic',
'I': 'hydrophobic',
'M': 'hydrophobic',
'P': 'special',
'F': 'aromatic',
'Y': 'armomatic',
'W': 'aromatic',
'*': 'stop'}
df['mutant_type'] = df['mutant'].map(letter_to_type)
grouped = df.groupby(['site', 'mutant_type'])
filtered_groups = grouped.filter(lambda x: (x['effect'] > positive_value).all() and len(x) >= min_group_size)
#Positive Mask
positive_mask = (
filtered_groups.groupby('site')['effect'].transform(lambda x: (x > positive_value).all()) &
df.groupby('site')['effect'].transform(lambda x: (x <= positive_value).any())
)
positive_sites = df[positive_mask]
#Negative mask
filtered_groups_neg = grouped.filter(lambda x: (x['effect'] < negative_value).all() and len(x) >= min_group_size)
negative_mask = (
filtered_groups_neg.groupby('site')['effect'].transform(lambda x: (x < negative_value).all()) &
df.groupby('site')['effect'].transform(lambda x: (x <= negative_value).any())
)
negative_sites = df[negative_mask]
common_sites = pd.merge(positive_sites, negative_sites, on='site', how='inner')
final_sites = common_sites['site'].drop_duplicates()
return final_sites
final_sites_E2 = check_muts_by_properties(func_scores_E2,2,-0.25,-2)
final_sites_E3 = check_muts_by_properties(func_scores_E3,2,-0.25,-2)
print(list(final_sites_E2))
print(list(final_sites_E3))
[76, 78, 93, 103, 136, 139, 141, 144, 145, 154, 156, 164, 228, 230, 232, 237, 244, 247, 248, 250, 268, 283, 284, 285, 286, 294, 295, 296, 316, 349, 350, 400, 414, 416, 426, 431, 438, 465, 469, 475, 477, 482, 509, 511, 521, 531, 547, 576] [79, 101, 118, 119, 141, 150, 154, 155, 228, 231, 235, 241, 250, 256, 283, 284, 285, 286, 291, 294, 296, 298, 305, 316, 319, 348, 350, 354, 408, 411, 414, 416, 431, 437, 442, 443, 444, 465, 469, 475, 477, 482, 485, 491, 492, 511, 514, 520, 521, 530, 546, 547, 551, 552, 559, 575, 576, 577, 578, 591]
In [23]:
def plot_heatmap_full_update_test(df,sites_,legend_title):
# Create y-axis order list
custom_order = ['contact','entropy','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
# Prepare the color scales separately for entropy and effects
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2.5])
color_scale_entropy = alt.Scale(scheme='purples', domain=[0, 2],reverse=True)
color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
strokewidth_size = 0.25
# container to hold the charts
charts = []
#for subset in full_ranges:
subset_df = final_df[final_df['site'].isin(sites_)] #for the wrapping of sites
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title=None, sort=sites, axis=alt.Axis(labelAngle=-90,grid=False)),
y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
tooltip=['site','wildtype','mutant','value'],
).properties(
width=alt.Step(10),
height=alt.Step(11)
)
)
# Individual Heatmaps
chart_empty = (
base.mark_rect(color='#d1d3d4').encode().transform_filter( ##bcbec0
(alt.datum.effect == 'effect')
)
)
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)),
alt.value('transparent')),
).transform_filter(
(alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
chart_entropy = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant == "entropy"',
alt.Color('value:Q', scale=color_scale_entropy,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
chart_contact = (
base.mark_rect(color='black').encode(
#color=alt.condition(f'datum.mutant == "contact"',
#alt.Color('value:Q', scale=color_scale_contact,legend=None),
#alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'contact') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype'],
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8,align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype'],
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.mutant != 'entropy') & (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect, chart_entropy, chart_contact, wildtype_layer_box, wildtype_layer).resolve_scale(color='independent')
charts.append(chart)
combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='independent')
return combined_chart.display()
plot_heatmap_full_update_test(empty_df_E2,final_sites_E2,'EFNB2 Entry')
plot_heatmap_full_update_test(empty_df_E3,final_sites_E3,'EFNB3 Entry')
Show heatmap of different wildtype amino acid classes¶
In [24]:
hydrophobic_AA = ['A','V','L','I','M']
aromatic_AA = ['Y','W','F']
positive_AA = ['K','R','H']
negative_AA = ['E','D']
hydrophilic_AA = ['S','T','N','Q']
def find_aa_wildtype_sites(df,aa_type):
aa_list = list(df[df['wildtype'].isin(aa_type)]['site'].unique())
return aa_list
# Find sites where the WT are different classes of amino acids
hydrophobic_AA_list = find_aa_wildtype_sites(func_scores_E3,hydrophobic_AA)
aromatic_AA_list = find_aa_wildtype_sites(func_scores_E3,aromatic_AA)
positive_AA_list = find_aa_wildtype_sites(func_scores_E3,positive_AA)
negative_AA_list = find_aa_wildtype_sites(func_scores_E3,negative_AA)
hydrophilic_AA_list = find_aa_wildtype_sites(func_scores_E3,hydrophilic_AA)
all_AA = [hydrophobic_AA_list,aromatic_AA_list,positive_AA_list,negative_AA_list,hydrophilic_AA_list]
for i in all_AA:
plot_heatmap_full_update_test(empty_df_E2,i,'EFNB2 Entry')
plot_heatmap_full_update_test(empty_df_E3,i,'EFNB3 Entry')
Show heatmap of sites that conserved in all henipaviruses¶
In [25]:
plot_heatmap_full_update_test(empty_df_E2,entropy_sites,'EFNB2 Entry')
plot_heatmap_full_update_test(empty_df_E3,entropy_sites,'EFNB3 Entry')
Show heatmap of sites that only have deleterious mutations¶
In [26]:
plot_heatmap_full_update_test(empty_df_E2,intolerant_sites_E2,'EFNB2 entry')
plot_heatmap_full_update_test(empty_df_E3,intolerant_sites_E3,'EFNB3 entry')
Make heatmap of contact sites¶
In [27]:
def make_df(list_, name_):
#print(name_) # This will print the actual value of name_
df = pd.DataFrame({
'site': list_,
name_: [0.0] * len(list_)
})
df = df[['site', name_]] # Use name_ directly if it's a string
df['mutant'] = name_ # Use name_ directly if it's a string
df['wildtype'] = ''
df['effect'] = 'effect'
df.rename(columns={name_: 'value'}, inplace=True) # Use name_ directly if it's a string
#display(df)
return df
# Example calls to the function
hydrophobic_df = make_df(config['hydrophobic'], 'hydrophobic')
hydrogen_df = make_df(config['h_bond_total'], 'hydrogen bonds')
salt_bridge_df = make_df(config['salt_bridges'], 'salt bridges')
def make_empty_df_contact(df):
sites = range(71, 603)
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
# Create the combination of each site with each amino acid
data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
# Create the DataFrame
empty_df = pd.DataFrame(data)
all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
df_test = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['effect'],
var_name='effect', value_name='value')
df_test = pd.concat([df_test,hydrophobic_df,hydrogen_df,salt_bridge_df],ignore_index=True)
return df_test
empty_df_E2 = make_empty_df_contact(func_scores_E2)
empty_df_E3 = make_empty_df_contact(func_scores_E3)
def plot_heatmap_full_contact_site(df,legend_title):
# Create y-axis order list
custom_order = ['salt bridges','hydrogen bonds','hydrophobic','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
# Prepare the color scales separately for entropy and effects
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2.5])
color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
strokewidth_size = 0.25
# container to hold the charts
charts = []
#for subset in full_ranges:
subset_df = final_df[final_df['site'].isin(config['contact_sites'])] #for the wrapping of sites
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title='Site', sort=sites, axis=alt.Axis(labelAngle=-90,grid=False)),
y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
tooltip=['site','wildtype','mutant','value'],
).properties(
width=alt.Step(10),
height=alt.Step(10)
)
)
# Individual Heatmaps
chart_empty = (
base.mark_rect(color='#d1d3d4').encode().transform_filter( ##bcbec0
(alt.datum.effect == 'effect')
)
)
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)),
alt.value('transparent')),
).transform_filter(
(alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
chart_entropy = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant == "entropy"',
alt.Color('value:Q', scale=color_scale_entropy,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
chart_other = (
base.mark_rect(color='black').encode(
).transform_filter(
(alt.datum.mutant == 'hydrophobic') | (alt.datum.mutant == 'salt bridges') | (alt.datum.mutant == 'hydrogen bonds')
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype'],
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8,align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype'],
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.mutant != 'entropy') & (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect, chart_entropy, chart_other, wildtype_layer_box, wildtype_layer).resolve_scale(color='independent')
charts.append(chart)
combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='independent')
return combined_chart.display()
plot_heatmap_full_contact_site(empty_df_E2,'EFNB2 Entry')
plot_heatmap_full_contact_site(empty_df_E3,'EFNB3 Entry')
Plot heatmap of cysteine and n-linked glycosylation motifs¶
In [28]:
def make_empty_df(df):
sites = range(71, 603)
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
# Create the combination of each site with each amino acid
data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
# Create the DataFrame
empty_df = pd.DataFrame(data)
all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
df_test = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['effect'],
var_name='effect', value_name='value')
df_test = pd.concat([entropy_df,df_test],ignore_index=True)
return df_test
empty_df_E2 = make_empty_df(func_scores_E2)
empty_df_E3 = make_empty_df(func_scores_E3)
def plot_heatmap_glycan_cysteine(df,sites,legend_title):
# Create y-axis order list
custom_order = ['entropy','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df[df['site'].isin(sites)]
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
# Prepare the color scales separately for entropy and effects
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2.5])
color_scale_entropy = alt.Scale(scheme='purples', domain=[0, 2],reverse=True)
color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
strokewidth_size = 0.25
# container to hold the charts
charts = []
#for subset in full_ranges:
subset_df = final_df[final_df['site'].isin(sites)] #for the wrapping of sites
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title=None, sort=sites, axis=alt.Axis(labelAngle=-90,grid=False)),
y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
tooltip=['site','wildtype','mutant','value'],
).properties(
width=alt.Step(10),
height=alt.Step(11)
)
)
# Individual Heatmaps
chart_empty = (
base.mark_rect(color='#d1d3d4').encode().transform_filter( ##bcbec0
(alt.datum.effect == 'effect')
)
)
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)),
alt.value('transparent')),
).transform_filter(
(alt.datum.effect == 'effect') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
chart_entropy = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant == "entropy"',
alt.Color('value:Q', scale=color_scale_entropy,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'entropy') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype'],
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
#display(unique_wildtypes_df)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8,align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
tooltip=['site','wildtype'],
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.mutant != 'entropy') & (alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect, chart_entropy, wildtype_layer_box, wildtype_layer).resolve_scale(color='independent')
charts.append(chart)
combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='independent')
return combined_chart.display()
cysteine_neck = [146, 158, 162]
cysteine_1 = [189,601]
cysteine_2 = [216, 240]
cysteine_3 = [282,295]
cysteine_4 = [382, 395]
cysteine_5 = [387, 499]
cysteine_6 = [493, 503]
cysteine_7 = [565, 574]
cysteine = cysteine_neck + cysteine_1 + cysteine_2 + cysteine_3 + cysteine_4 + cysteine_5 + cysteine_6 + cysteine_7
n_linked = config['glycans']
stalk = list(range(96, 147))
neck = list(range(148,166))
linker = list(range(166,177))
for sites in [cysteine,n_linked,stalk,neck,linker]:
plot_heatmap_glycan_cysteine(empty_df_E2,sites,'EFNB2 Entry')
plot_heatmap_glycan_cysteine(empty_df_E3,sites,'EFNB3 Entry')
Check for potential neutral/beneficial glycosylation sites¶
In [29]:
def find_potential_glycan_sites(df):
filtered = df[df['wildtype'].isin(['T', 'S'])]
matching_sites = []
for index, row in filtered.iterrows():
# Check for existence of site two numbers prior with 'N' mutant and positive effect
prior_rows = df[(df['site'] == row['site'] - 2) & (df['mutant'] == 'N') & (df['effect'] > 0)]
if not prior_rows.empty:
matching_sites.append(row['site'])
unique_list1 = list(set(matching_sites))
unique_list1 = [x - 2 for x in unique_list1]
filtered = df[df['wildtype'].isin(['N'])]
matching_sites = []
for index, row in filtered.iterrows():
# Check for existence of site two numbers prior with 'N' mutant and positive effect
prior_rows = df[(df['site'] == row['site'] + 2) & (df['mutant'].isin(['T','S'])) & (df['effect'] > 0)]
if not prior_rows.empty:
matching_sites.append(row['site'])
unique_list2 = list(set(matching_sites))
unique_list = unique_list1 + unique_list2
unique_list.sort()
print(f'The total number of potential PNLG sites are: {len(unique_list)}')
return unique_list
PNLG = find_potential_glycan_sites(func_scores_E3)
surface = pd.read_csv(config['surface'])
surface.rename(columns={'exposure_A': 'Surface Exposure'},inplace=True)
PNLG_SURFACE = surface[surface['site'].isin(PNLG)]
PNLG_SURFACE = list(PNLG_SURFACE[PNLG_SURFACE['Surface Exposure'] >= 25]['site'].unique())
print(f'The surface exposed PNLG sites are: {PNLG_SURFACE}')
glycans = config['glycans']
filtered_PNLG_SURFACE = [num for num in PNLG_SURFACE if num not in glycans]
print(filtered_PNLG_SURFACE)
print(len(filtered_PNLG_SURFACE))
The total number of potential PNLG sites are: 33 The surface exposed PNLG sites are: [159, 180, 191, 192, 275, 288, 306, 311, 326, 378, 383, 403, 417, 423, 473, 478, 518, 543, 554, 570, 600] [180, 191, 192, 275, 288, 311, 326, 383, 403, 423, 473, 478, 518, 543, 554, 570, 600] 17
In [ ]:
In [ ]: